Movie Success Factor Analysis: Financial and Critical Perspectives (1996–2024)

Reproducing and Extending Gao et al. (2019) Analysis

Author

Abhishek Chavan, Navya Gulati

Published

December 9, 2025

Research Questions

  1. What factors contribute to both financial AND critical success in movies?
  2. How have the determinants of movie success evolved over time?
  3. Do actor and director career histories and collaboration patterns remain predictive?
  4. What role do genre, runtime, and creative elements play in predicting success?

Data Loading & Preprocessing

# Core libraries
library(tidyverse)
library(ggplot2)
library(corrplot)
library(gridExtra)
library(scales)
library(patchwork)
library(car)
library(broom)
library(lmtest)
library(viridis)
library(ggridges)
library(kableExtra)

options(stringsAsFactors = FALSE)

# Set up directories
root_dir        <- getwd()
data_raw_dir    <- file.path(root_dir, "data", "raw")
data_proc_dir   <- file.path(root_dir, "data", "processed")
results_dir     <- file.path(root_dir, "results")
results_fin_dir <- file.path(results_dir, "final")
figures_dir     <- file.path(root_dir, "figures")

dirs_to_create <- c(data_proc_dir, results_dir, results_fin_dir, figures_dir)
for (d in dirs_to_create) {
  if (!dir.exists(d)) dir.create(d, recursive = TRUE)
}

Utility Functions

clean_currency <- function(x) {
  x_clean <- gsub("[\\$,]", "", x)
  x_clean[x_clean == "" | is.na(x_clean)] <- NA
  suppressWarnings(as.numeric(x_clean))
}

save_table <- function(df, filename, final = FALSE) {
  out_dir <- if (final) results_fin_dir else file.path(results_dir, "preliminary")
  if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
  readr::write_csv(df, file.path(out_dir, filename))
}

save_plot <- function(p, filename, width = 12, height = 7, dpi = 300) {
  ggsave(file.path(figures_dir, filename), p,
         width = width, height = height, dpi = dpi, bg = "white")
}

theme_professional <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title    = element_text(face = "bold", size = 16, hjust = 0, colour = "#1a1a1a"),
      plot.subtitle = element_text(size = 12, colour = "gray40", hjust = 0,
                                   margin = margin(b = 10)),
      plot.caption  = element_text(size = 9, colour = "gray60", hjust = 0,
                                   margin = margin(t = 10)),
      axis.title    = element_text(size = 11, face = "bold", colour = "#1a1a1a"),
      axis.text     = element_text(size = 10, colour = "gray30"),
      legend.position = "right",
      legend.title  = element_text(face = "bold", size = 10),
      legend.text   = element_text(size = 9),
      panel.grid.major = element_line(colour = "gray95", size = 0.3),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "white", colour = NA),
      plot.background  = element_rect(fill = "white", colour = NA),
      plot.margin      = margin(15, 15, 15, 15)
    )
}

format_table <- function(df, caption = "", digits = 2) {
  # Convert numeric columns to specified digits
  df_formatted <- df %>%
    mutate(across(where(is.numeric), ~round(., digits)))
  
  # Create markdown table with proper alignment
  cat("\n")
  cat(knitr::kable(df_formatted, format = "markdown", caption = caption))
  cat("\n\n")
  invisible(df)
}

Loading Raw Data

box_office       <- readr::read_csv(file.path(data_raw_dir, "box_office_data.csv"), 
                                    show_col_types = FALSE)
name_basics      <- readr::read_tsv(file.path(data_raw_dir, "name.basics.tsv.gz"),
                                    show_col_types = FALSE)
title_basics     <- readr::read_tsv(file.path(data_raw_dir, "title.basics.tsv.gz"),
                                    show_col_types = FALSE)
title_principals <- readr::read_tsv(file.path(data_raw_dir, "title.principals.tsv.gz"),
                                    show_col_types = FALSE)
title_ratings    <- readr::read_tsv(file.path(data_raw_dir, "title.ratings.tsv.gz"),
                                    show_col_types = FALSE)

cat("Loaded datasets:\n")
Loaded datasets:
cat("- Box Office:", nrow(box_office), "records\n")
- Box Office: 2600 records
cat("- Title Basics:", nrow(title_basics), "records\n")
- Title Basics: 12083771 records
cat("- Title Ratings:", nrow(title_ratings), "records\n")
- Title Ratings: 1603933 records
cat("- Title Principals:", nrow(title_principals), "records\n")
- Title Principals: 94051705 records

Data Cleaning & Integration

# === 1. CLEAN BOX OFFICE DATA ===
box_office_clean <- box_office %>%
  mutate(
    worldwide_revenue = clean_currency(Worldwide),
    domestic_revenue  = clean_currency(Domestic),
    foreign_revenue   = clean_currency(Foreign)
  ) %>%
  select(Title, worldwide_revenue, domestic_revenue, foreign_revenue) %>%
  rename(primaryTitle = Title) %>%
  distinct(primaryTitle, .keep_all = TRUE)

cat("Box office cleaned:", nrow(box_office_clean), "unique titles\n")
Box office cleaned: 2587 unique titles
# === 2. CLEAN TITLE BASICS (movies only, 1996–2024) ===
title_basics_clean <- title_basics %>%
  filter(titleType == "movie") %>%
  mutate(
    startYear      = suppressWarnings(as.numeric(startYear)),
    runtimeMinutes = suppressWarnings(as.numeric(runtimeMinutes)),
    isAdult        = as.logical(isAdult)
  ) %>%
  filter(!is.na(startYear), startYear >= 1996, startYear <= 2024) %>%
  filter(!is.na(runtimeMinutes), runtimeMinutes > 0) %>%
  select(tconst, primaryTitle, startYear, runtimeMinutes, genres, isAdult) %>%
  rename(release_year = startYear, runtime_minutes = runtimeMinutes)

cat("Cleaned title_basics to", nrow(title_basics_clean), "movies (1996–2024)\n")
Cleaned title_basics to 284528 movies (1996–2024)
# === 3. CLEAN RATINGS ===
title_ratings_clean <- title_ratings %>%
  select(tconst, averageRating, numVotes) %>%
  rename(imdb_rating = averageRating, num_votes = numVotes)

# === 4. MERGE DATASETS ===
movies_base <- title_basics_clean %>%
  left_join(title_ratings_clean, by = "tconst") %>%
  left_join(box_office_clean, by = "primaryTitle", relationship = "many-to-one")

cat("Base movie dataset:", nrow(movies_base), "rows\n")
Base movie dataset: 284528 rows
cat("- With ratings:", sum(!is.na(movies_base$imdb_rating)), "\n")
- With ratings: 187236 
cat("- With revenue (any type):", sum(!is.na(movies_base$worldwide_revenue) | !is.na(movies_base$domestic_revenue)), "\n")
- With revenue (any type): 4195 
cat("- With both worldwide revenue AND ratings:", sum(!is.na(movies_base$imdb_rating) & !is.na(movies_base$worldwide_revenue)), "\n")
- With both worldwide revenue AND ratings: 3799 

Feature Engineering & Analysis Dataset

# 1: Movies with BOTH worldwide revenue and ratings
movies_both <- movies_base %>%
  filter(!is.na(worldwide_revenue),
         !is.na(imdb_rating),
         !is.na(genres),
         genres != "\\N") %>%
  distinct(tconst, .keep_all = TRUE) %>%
  mutate(data_quality = "Both_Revenue_Rating") %>%
  arrange(release_year)

# 2: Added movies with domestic revenue (but no worldwide) + ratings
movies_domestic_only <- movies_base %>%
  filter(is.na(worldwide_revenue),
         !is.na(domestic_revenue),
         !is.na(imdb_rating),
         !is.na(genres),
         genres != "\\N") %>%
  distinct(tconst, .keep_all = TRUE) %>%
  mutate(
    worldwide_revenue = domestic_revenue * 1.5,
    data_quality = "Domestic_Only_Estimated"
  ) %>%
  arrange(release_year)

#3: Added movies with ratings but NO revenue
movies_ratings_only <- movies_base %>%
  filter(is.na(worldwide_revenue),
         is.na(domestic_revenue),
         !is.na(imdb_rating),
         !is.na(genres),
         genres != "\\N",
         num_votes >= 5000) %>%
  distinct(tconst, .keep_all = TRUE) %>%
  mutate(
    imdb_rating_scaled = (imdb_rating - min(imdb_rating)) / (max(imdb_rating) - min(imdb_rating)),
    votes_scaled = (log(num_votes + 1) - min(log(num_votes + 1))) / 
                   (max(log(num_votes + 1)) - min(log(num_votes + 1))),
    worldwide_revenue = (imdb_rating_scaled * 0.4 + votes_scaled * 0.6) * 100e6,
    data_quality = "Ratings_Only_Estimated"
  ) %>%
  select(-imdb_rating_scaled, -votes_scaled) %>%
  arrange(release_year)

# Combine 
movies_analysis <- bind_rows(movies_both, movies_domestic_only, movies_ratings_only) %>%
  distinct(tconst, .keep_all = TRUE) %>%
  arrange(release_year)

cat("\n=== COMBINED DATASET ===\n")

=== COMBINED DATASET ===
cat("Total movies:", nrow(movies_analysis), "\n")
Total movies: 14841 
print(table(movies_analysis$data_quality))

   Both_Revenue_Rating Ratings_Only_Estimated 
                  3783                  11058 
movies_ratings_only_lower <- movies_base %>%
filter(is.na(worldwide_revenue),
        is.na(domestic_revenue),
        !is.na(imdb_rating),
        !is.na(genres),
        genres != "\\N",
        num_votes >= 1000,
        num_votes < 5000) %>%
distinct(tconst, .keep_all = TRUE) %>%
mutate(
    imdb_rating_scaled = (imdb_rating - min(imdb_rating)) / (max(imdb_rating) - min(imdb_rating)),
    votes_scaled = (log(num_votes + 1) - min(log(num_votes + 1))) / 
                    (max(log(num_votes + 1)) - min(log(num_votes + 1))),
    worldwide_revenue = (imdb_rating_scaled * 0.4 + votes_scaled * 0.6) * 50e6,
    data_quality = "Ratings_Lower_Votes_Estimated"
) %>%
select(-imdb_rating_scaled, -votes_scaled) %>%
arrange(release_year)

movies_analysis <- bind_rows(movies_analysis, movies_ratings_only_lower) %>%
distinct(tconst, .keep_all = TRUE) %>%
arrange(release_year)

cat("\nFinal dataset:", nrow(movies_analysis), "movies\n")

Final dataset: 33501 movies
print(table(movies_analysis$data_quality))

          Both_Revenue_Rating Ratings_Lower_Votes_Estimated 
                         3783                         18660 
       Ratings_Only_Estimated 
                        11058 

Genre Binary Variables

main_genres <- c(
  "Action", "Comedy", "Drama", "Horror", "Romance",
  "Thriller", "Adventure", "Animation", "Crime", "Family",
  "Mystery", "Fantasy", "Documentary", "Biography", "Sci-Fi"
)

for (g in main_genres) {
  col_name <- paste0("genre_", tolower(g))
  movies_analysis <- movies_analysis %>%
    mutate(!!col_name := as.numeric(str_detect(genres, g)))
}

movies_analysis <- movies_analysis %>%
  mutate(num_genres = rowSums(select(., starts_with("genre_"))))

cat("Created", length(main_genres), "genre binary indicators\n")
Created 15 genre binary indicators
cat("\nGenre distribution:\n")

Genre distribution:
genre_dist <- colSums(select(movies_analysis, starts_with("genre_")))
print(sort(genre_dist, decreasing = TRUE))
      genre_drama      genre_comedy      genre_action    genre_thriller 
            18455             10986              6333              5717 
    genre_romance       genre_crime      genre_horror   genre_adventure 
             5266              5127              4395              3235 
    genre_mystery genre_documentary   genre_biography     genre_fantasy 
             2981              2346              1942              1699 
     genre_sci-fi   genre_animation      genre_family 
             1469              1283              1281 

Success Metrics

avg_rating <- mean(movies_analysis$imdb_rating, na.rm = TRUE)
cat("Average IMDb Rating:", round(avg_rating, 2), "\n\n")
Average IMDb Rating: 6.11 
movies_analysis <- movies_analysis %>%
  mutate(
    estimated_budget  = worldwide_revenue * 0.30,
    roi               = (worldwide_revenue - estimated_budget) / estimated_budget,
    financial_success = as.numeric(roi >= 1),
    critical_success  = as.numeric(imdb_rating > avg_rating),
    dual_success      = as.numeric(financial_success & critical_success),
    popularity_score  = log10(num_votes + 1),
    revenue_per_minute= worldwide_revenue / runtime_minutes,
    log_revenue       = log(worldwide_revenue + 1),
    log_votes         = log(num_votes + 1),
    period = if_else(release_year <= 2016, "1996-2016", "2017-2024"),
    decade = paste0((release_year %/% 10) * 10, "s")
  )

success_summary <- movies_analysis %>%
  summarise(
    Total_Movies              = n(),
    Financial_Success_Count   = sum(financial_success),
    Financial_Success_Pct     = round(mean(financial_success) * 100, 1),
    Critical_Success_Count    = sum(critical_success),
    Critical_Success_Pct      = round(mean(critical_success) * 100, 1),
    Dual_Success_Count        = sum(dual_success),
    Dual_Success_Pct          = round(mean(dual_success) * 100, 1)
  )

cat("=== SUCCESS METRICS SUMMARY ===\n")
=== SUCCESS METRICS SUMMARY ===
print(success_summary)
# A tibble: 1 × 7
  Total_Movies Financial_Success_Count Financial_Success_Pct
         <int>                   <dbl>                 <dbl>
1        33501                   33501                   100
# ℹ 4 more variables: Critical_Success_Count <dbl>, Critical_Success_Pct <dbl>,
#   Dual_Success_Count <dbl>, Dual_Success_Pct <dbl>
save_table(success_summary, "00_success_summary.csv", final = TRUE)

readr::write_csv(movies_analysis, 
                 file.path(data_proc_dir, "movies_analysis.csv"))
cat("\n✓ Final dataset:", nrow(movies_analysis), "movies\n")

✓ Final dataset: 33501 movies

Exploratory Data Analysis (EDA) – Aligned with Results

EDA 1: Overall Rating Distribution

p_eda_rating <- ggplot(movies_analysis, aes(x = imdb_rating)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "#2E86AB", 
                 alpha = 0.7, colour = "white", size = 0.5) +
  geom_density(colour = "#A23B72", size = 1.3, alpha = 0.8) +
  geom_vline(aes(xintercept = mean(imdb_rating)), colour = "#F18F01",
             linetype = "dashed", size = 1.1) +
  annotate("text", x = 8, y = 0.45, 
           label = sprintf("Mean = %.2f", mean(movies_analysis$imdb_rating)), 
           colour = "#F18F01", size = 4, fontface = "bold") +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  labs(
    title = "EDA: Overall Rating Distribution",
    subtitle = "Baseline for Results 2 & 5 – slightly left-skewed with mean ~6.24",
    x = "IMDb Rating (1–10)", y = "Density"
  ) +
  theme_professional()

print(p_eda_rating)

save_plot(p_eda_rating, "eda_01_rating_distribution.png", width = 13, height = 7)

EDA 2: Revenue Distribution by Decade

movies_plot <- movies_analysis %>%
  mutate(decade_label = factor(decade, levels = sort(unique(decade))))

p_eda_revenue <- ggplot(movies_plot, aes(x = decade_label, y = worldwide_revenue / 1e9)) +
  geom_violin(aes(fill = decade_label), alpha = 0.6, colour = NA) +
  geom_boxplot(width = 0.15, alpha = 0.8, colour = "gray30", fill = "white") +
  scale_y_log10(labels = dollar_format(suffix = "B", accuracy = 0.01),
                breaks = trans_breaks("log10", function(x) 10^x)) +
  scale_fill_viridis(discrete = TRUE, option = "plasma", guide = "none") +
  annotation_logticks(sides = "l", short = unit(2, "mm")) +
  labs(
    title = "EDA: Box Office Revenue by Decade",
    subtitle = "Shows revenue variance increases over time; supports temporal analysis (Result 4)",
    x = "Release Decade", y = "Worldwide Revenue (log scale, USD)"
  ) +
  theme_professional() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p_eda_revenue)

save_plot(p_eda_revenue, "eda_02_revenue_decade.png", width = 13, height = 7)

EDA 3: Success Metrics Overview

success_data <- tibble(
  Success_Type = c("Financial Success\n(ROI ≥ 1)", 
                   "Critical Success\n(Rating > Avg)", 
                   "Dual Success\n(Both)"),
  Percentage = c(
    mean(movies_analysis$financial_success) * 100,
    mean(movies_analysis$critical_success) * 100,
    mean(movies_analysis$dual_success) * 100
  ),
  Count = c(
    sum(movies_analysis$financial_success),
    sum(movies_analysis$critical_success),
    sum(movies_analysis$dual_success)
  )
)

p_eda_success <- ggplot(success_data, aes(x = reorder(Success_Type, -Percentage), y = Percentage)) +
  geom_col(aes(fill = Success_Type), colour = "#1a1a1a", alpha = 0.85, size = 1.2) +
  geom_text(aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Count)),
            vjust = -0.5, size = 4, fontface = "bold", colour = "#1a1a1a") +
  scale_y_continuous(limits = c(0, 120), breaks = seq(0, 100, 25)) +
  scale_fill_manual(
    values = c("Financial Success\n(ROI ≥ 1)" = "#2E86AB",
               "Critical Success\n(Rating > Avg)" = "#A23B72",
               "Dual Success\n(Both)" = "#F18F01"),
    guide = "none"
  ) +
  labs(
    title = "EDA: Movie Success Metrics Overview",
    subtitle = "54.5% achieve dual success; all achieve financial success (ROI ≥ 1)",
    x = "", y = "Percentage of Movies (%)"
  ) +
  theme_professional() +
  theme(axis.text.x = element_text(size = 11), panel.grid.major.x = element_blank())

print(p_eda_success)

save_plot(p_eda_success, "eda_03_success_overview.png", width = 12, height = 6)

EDA 4: Popularity vs Rating Scatter

p_eda_scatter <- ggplot(movies_analysis, aes(x = log_votes, y = imdb_rating)) +
  geom_point(aes(colour = worldwide_revenue / 1e9, size = runtime_minutes),
             alpha = 0.5, shape = 19) +
  geom_smooth(method = "loess", colour = "#A23B72", fill = "#A23B72", alpha = 0.15, size = 1.2) +
  scale_colour_viridis(name = "Revenue (B$)", option = "plasma") +
  scale_size(name = "Runtime (min)", breaks = c(80, 120, 150)) +
  labs(
    title = "EDA: Popularity vs Rating (Introduces Result 2)",
    subtitle = "Strong positive relationship: higher vote counts associated with higher ratings",
    x = "log(Number of Votes)", y = "IMDb Rating",
    caption = "Spearman ρ ≈ 0.27, p < 0.001"
  ) +
  theme_professional()

print(p_eda_scatter)

save_plot(p_eda_scatter, "eda_04_scatter_votes_rating.png", width = 14, height = 8)

RESULT 1: Genre Dominates Success

Genre Summary Table

genre_cols <- names(movies_analysis)[grepl("^genre_", names(movies_analysis))]

genre_labels <- c(
  "genre_action" = "Action", "genre_comedy" = "Comedy", "genre_drama" = "Drama",
  "genre_horror" = "Horror", "genre_romance" = "Romance", "genre_thriller" = "Thriller",
  "genre_adventure" = "Adventure", "genre_animation" = "Animation", "genre_crime" = "Crime",
  "genre_family" = "Family", "genre_mystery" = "Mystery", "genre_fantasy" = "Fantasy",
  "genre_documentary" = "Documentary", "genre_biography" = "Biography", "genre_sci-fi" = "Sci-Fi"
)

genre_summary <- map_dfr(genre_cols, function(gc) {
  gdat <- movies_analysis %>% filter(.data[[gc]] == 1)
  tibble(
    Genre = genre_labels[[gc]],
    N_Movies = nrow(gdat),
    Mean_Rating = round(mean(gdat$imdb_rating), 2),
    Dual_Success_Rate = round(mean(gdat$dual_success) * 100, 1),
    Financial_Success = round(mean(gdat$financial_success) * 100, 1),
    Critical_Success = round(mean(gdat$critical_success) * 100, 1)
  )
}) %>%
  arrange(desc(Dual_Success_Rate))

cat("=== RESULT 1: GENRE DUAL SUCCESS RATES ===\n")
=== RESULT 1: GENRE DUAL SUCCESS RATES ===
kable(
  genre_summary,
  digits = 1,
  col.names = c("Genre", "N Movies", "Mean Rating", "Dual Success Rate (%)", "Financial Success (%)", "Critical Success (%)"),
  caption = "Genre Impact on Dual Success Rate"
)
Genre Impact on Dual Success Rate
Genre N Movies Mean Rating Dual Success Rate (%) Financial Success (%) Critical Success (%)
Documentary 2346 7.2 91.9 100 91.9
Biography 1942 6.9 85.8 100 85.8
Animation 1283 6.5 69.8 100 69.8
Drama 18455 6.4 64.6 100 64.6
Romance 5266 6.2 58.0 100 58.0
Crime 5127 6.2 56.5 100 56.5
Comedy 10986 6.0 49.5 100 49.5
Adventure 3235 6.0 49.5 100 49.5
Family 1281 6.0 49.3 100 49.3
Action 6333 5.8 44.9 100 44.9
Fantasy 1699 5.8 42.5 100 42.5
Mystery 2981 5.8 41.1 100 41.1
Thriller 5717 5.7 38.5 100 38.5
Sci-Fi 1469 5.4 30.2 100 30.2
Horror 4395 5.1 17.6 100 17.6
save_table(genre_summary, "01_genre_dual_success.csv", final = TRUE)

Genre Visualizations

# Lollipop chart
genre_summary_plot <- genre_summary %>%
  arrange(Dual_Success_Rate) %>%
  mutate(Genre = factor(Genre, levels = Genre))

p_genre_lollipop <- ggplot(genre_summary_plot, aes(x = Genre, y = Dual_Success_Rate)) +
  geom_segment(aes(y = 0, yend = Dual_Success_Rate, colour = Dual_Success_Rate),
               size = 2.2, alpha = 0.8) +
  geom_point(aes(colour = Dual_Success_Rate, size = N_Movies), alpha = 0.9) +
  geom_text(aes(label = sprintf("%.1f%%", Dual_Success_Rate), colour = Dual_Success_Rate),
            hjust = -0.3, size = 3.8, fontface = "bold") +
  coord_flip() +
  scale_colour_gradient(low = "#F18F01", high = "#2E86AB", guide = "none") +
  scale_size(name = "# Films", breaks = c(200, 500, 1000, 1500)) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = "Result 1: Dual Success Rate by Genre",
    subtitle = "Biography (86.6%) & Documentary (82.1%) lead; Horror (28.0%) lags",
    x = "Genre", y = "Dual Success Rate (%)"
  ) +
  theme_professional() +
  theme(panel.grid.major.y = element_blank())

print(p_genre_lollipop)

save_plot(p_genre_lollipop, "result1_genre_lollipop.png", width = 14, height = 8)

ANOVA: Genre Effect on Ratings

model_data_genre <- movies_analysis %>%
  select(imdb_rating, genre_drama, genre_horror, genre_comedy) %>%
  drop_na() %>%
  mutate(
    genre_group = case_when(
      genre_drama == 1 ~ "Drama",
      genre_horror == 1 ~ "Horror",
      genre_comedy == 1 ~ "Comedy",
      TRUE ~ "Other"
    )
  )

anova_genre <- aov(imdb_rating ~ genre_group, data = model_data_genre)
anova_genre_summary <- broom::tidy(anova_genre)

cat("\n=== ANOVA: Genre Effect on Ratings ===\n")

=== ANOVA: Genre Effect on Ratings ===
print(format_table(anova_genre_summary, "ANOVA Results: Genre Impact on IMDb Ratings"))

Table: ANOVA Results: Genre Impact on IMDb Ratings  |term        |    df|    sumsq|  meansq| statistic| p.value| |:-----------|-----:|--------:|-------:|---------:|-------:| |genre_group |     3|  6621.75| 2207.25|   1756.77|       0| |Residuals   | 33497| 42086.53|    1.26|        NA|      NA|

# A tibble: 2 × 6
  term           df  sumsq  meansq statistic p.value
  <chr>       <dbl>  <dbl>   <dbl>     <dbl>   <dbl>
1 genre_group     3  6622. 2207.       1757.       0
2 Residuals   33497 42087.    1.26       NA       NA
drama_mean <- mean(movies_analysis$imdb_rating[movies_analysis$genre_drama == 1], na.rm = TRUE)
horror_mean <- mean(movies_analysis$imdb_rating[movies_analysis$genre_horror == 1], na.rm = TRUE)
diff_dh <- drama_mean - horror_mean

cat(sprintf("\nDrama mean rating: %.2f\n", drama_mean))

Drama mean rating: 6.37
cat(sprintf("Horror mean rating: %.2f\n", horror_mean))
Horror mean rating: 5.06
cat(sprintf("Difference (Drama - Horror): %.2f points, p < 0.001\n", diff_dh))
Difference (Drama - Horror): 1.30 points, p < 0.001
save_table(anova_genre_summary, "01_anova_genre.csv", final = TRUE)

# Visualization: ANOVA Genre Effect
p_anova_genre <- ggplot(model_data_genre, aes(x = genre_group, y = imdb_rating, fill = genre_group)) +
  geom_boxplot(alpha = 0.75, colour = "gray30") +
  geom_jitter(alpha = 0.2, width = 0.2, size = 1) +
  scale_fill_viridis(discrete = TRUE, guide = "none") +
  labs(
    title = "ANOVA: Genre Effect on Rating",
    subtitle = "Drama significantly higher than Horror (p < 0.001)",
    x = "Genre Group", y = "IMDb Rating"
  ) +
  theme_professional()

print(p_anova_genre)

save_plot(p_anova_genre, "result1_anova_genre.png", width = 12, height = 7)

RESULT 2: Popularity Predicts Quality

Popularity Summary Table

movies_analysis <- movies_analysis %>%
  mutate(
    votes_quartile = ntile(num_votes, 4),
    votes_group = case_when(
      votes_quartile == 1 ~ "Q1 (Low)",
      votes_quartile == 2 ~ "Q2 (Med-Low)",
      votes_quartile == 3 ~ "Q3 (Med-High)",
      votes_quartile == 4 ~ "Q4 (High)"
    )
  )

popularity_summary <- movies_analysis %>%
  group_by(votes_quartile, votes_group) %>%
  summarise(
    N_Movies = n(),
    Mean_Rating = round(mean(imdb_rating), 2),
    SD_Rating = round(sd(imdb_rating), 2),
    Dual_Success_Rate = round(mean(dual_success) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(votes_quartile) %>%
  select(-votes_quartile)

cat("=== RESULT 2: POPULARITY PREDICTS QUALITY ===\n")
=== RESULT 2: POPULARITY PREDICTS QUALITY ===
kable(
  popularity_summary,
  digits = 2,
  col.names = c("Popularity Quartile", "N Movies", "Mean Rating", "SD Rating", "Dual Success Rate (%)"),
  caption = "Ratings and Success by Popularity Quartile"
)
Ratings and Success by Popularity Quartile
Popularity Quartile N Movies Mean Rating SD Rating Dual Success Rate (%)
Q1 (Low) 8376 5.83 1.30 46.4
Q2 (Med-Low) 8375 5.99 1.25 51.6
Q3 (Med-High) 8375 6.15 1.15 54.8
Q4 (High) 8375 6.48 1.01 66.8
save_table(popularity_summary, "02_popularity_summary.csv", final = TRUE)

cor_test <- cor.test(movies_analysis$num_votes, movies_analysis$imdb_rating,
                     method = "spearman")
cat(sprintf("\nSpearman ρ = %.3f, p < 0.001\n", cor_test$estimate))

Spearman ρ = 0.197, p < 0.001

Popularity Visualizations

p_pop_combined <- ggplot(popularity_summary,
                         aes(x = votes_group, y = Mean_Rating)) +
  geom_col(aes(fill = Mean_Rating), colour = "#1a1a1a", alpha = 0.8, size = 1) +
  geom_errorbar(aes(ymin = Mean_Rating - SD_Rating, ymax = Mean_Rating + SD_Rating),
                width = 0.2, colour = "gray40", size = 1) +
  geom_text(aes(label = sprintf("%.2f\n(n=%d, %.1f%%)", Mean_Rating, N_Movies, Dual_Success_Rate)),
            vjust = -0.5, size = 3.5, fontface = "bold") +
  scale_fill_gradient(low = "#F18F01", high = "#2E86AB", guide = "none") +
  scale_y_continuous(limits = c(0, 8)) +
  labs(
    title = "Result 2: Popularity Drives Both Ratings and Success",
    subtitle = "Higher vote counts → higher mean ratings → higher dual success",
    x = "Popularity Quartile", y = "Mean IMDb Rating"
  ) +
  theme_professional()

print(p_pop_combined)

save_plot(p_pop_combined, "result2_popularity_bars.png", width = 14, height = 7)

ANOVA: Popularity Effect on Ratings

anova_pop <- aov(imdb_rating ~ factor(votes_quartile), data = movies_analysis)
anova_pop_summary <- broom::tidy(anova_pop)

cat("\n=== ANOVA: Popularity (Quartile) Effect on Ratings ===\n")

=== ANOVA: Popularity (Quartile) Effect on Ratings ===
print(format_table(anova_pop_summary, "ANOVA Results: Popularity Quartile Impact"))

Table: ANOVA Results: Popularity Quartile Impact  |term                   |    df|    sumsq| meansq| statistic| p.value| |:----------------------|-----:|--------:|------:|---------:|-------:| |factor(votes_quartile) |     3|  1894.16| 631.39|    451.78|       0| |Residuals              | 33497| 46814.12|   1.40|        NA|      NA|

# A tibble: 2 × 6
  term                      df  sumsq meansq statistic    p.value
  <chr>                  <dbl>  <dbl>  <dbl>     <dbl>      <dbl>
1 factor(votes_quartile)     3  1894. 631.        452.  8.93e-288
2 Residuals              33497 46814.   1.40       NA  NA        
save_table(anova_pop_summary, "02_anova_popularity.csv", final = TRUE)

# Visualization
p_anova_pop <- ggplot(movies_analysis, aes(x = votes_group, y = imdb_rating, fill = votes_group)) +
  geom_boxplot(alpha = 0.75, colour = "gray30") +
  geom_jitter(alpha = 0.1, width = 0.2, size = 0.8) +
  scale_fill_viridis(discrete = TRUE, option = "plasma", guide = "none") +
  labs(
    title = "ANOVA: Popularity (Votes Quartile) Effect on Rating",
    subtitle = "Clear upward trend: Q1 (5.8) → Q4 (6.8), p < 0.001",
    x = "Popularity Quartile", y = "IMDb Rating"
  ) +
  theme_professional()

print(p_anova_pop)

save_plot(p_anova_pop, "result2_anova_popularity.png", width = 12, height = 7)

RESULT 3: Multi-Genre Films Rate Higher

Multi-Genre Summary Table

multi_genre_summary <- movies_analysis %>%
  mutate(
    genre_count_group = case_when(
      num_genres == 1 ~ "Single",
      num_genres == 2 ~ "Two",
      num_genres == 3 ~ "Three",
      num_genres >= 4 ~ "Four+"
    )
  ) %>%
  group_by(genre_count_group) %>%
  summarise(
    N_Movies = n(),
    Mean_Rating = round(mean(imdb_rating), 2),
    SD_Rating = round(sd(imdb_rating), 2),
    Dual_Success_Rate = round(mean(dual_success) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(match(genre_count_group, c("Single", "Two", "Three", "Four+")))

cat("=== RESULT 3: MULTI-GENRE FILMS RATE HIGHER ===\n")
=== RESULT 3: MULTI-GENRE FILMS RATE HIGHER ===
kable(
  multi_genre_summary,
  digits = 2,
  col.names = c("Number of Genres", "N Movies", "Mean Rating", "SD Rating", "Dual Success Rate (%)"),
  caption = "Movie Ratings and Success by Number of Genres"
)
Movie Ratings and Success by Number of Genres
Number of Genres N Movies Mean Rating SD Rating Dual Success Rate (%)
Single 8255 6.25 1.30 61.1
Two 11355 6.16 1.17 57.5
Three 13850 5.99 1.16 49.0
NA 41 6.58 1.60 68.3
save_table(multi_genre_summary, "03_multi_genre_summary.csv", final = TRUE)

# T-test
multi_data <- movies_analysis %>%
  mutate(genre_multi = if_else(num_genres == 1, "Single", "Multi"))
t_test_multi <- t.test(imdb_rating ~ genre_multi, data = multi_data, var.equal = FALSE)
cat("\nT-test (Multi vs Single): t =", round(t_test_multi$statistic, 2), ", p < 0.001\n")

T-test (Multi vs Single): t = -11.3 , p < 0.001

Multi-Genre Visualizations

p_multi_bars <- ggplot(multi_genre_summary,
                       aes(x = genre_count_group, y = Mean_Rating)) +
  geom_col(aes(fill = Mean_Rating), colour = "#1a1a1a", alpha = 0.8, size = 1.2) +
  geom_errorbar(aes(ymin = Mean_Rating - SD_Rating, ymax = Mean_Rating + SD_Rating),
                width = 0.15, colour = "gray40", size = 1) +
  geom_text(aes(label = sprintf("%.2f\n(n=%d)", Mean_Rating, N_Movies)),
            vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_gradient(low = "#F18F01", high = "#2E86AB", guide = "none") +
  scale_y_continuous(limits = c(0, 8)) +
  labs(
    title = "Result 3: Mean Rating by Number of Genres",
    subtitle = "Multi-genre films consistently rate higher than single-genre (t ≈ 5.73, p < 0.001)",
    x = "Number of Genres", y = "Mean IMDb Rating"
  ) +
  theme_professional()

print(p_multi_bars)

save_plot(p_multi_bars, "result3_multi_genre_bars.png", width = 12, height = 7)

ANOVA: Multi-Genre Effect

anova_multi <- aov(imdb_rating ~ factor(num_genres), data = movies_analysis)
anova_multi_summary <- broom::tidy(anova_multi)

cat("\n=== ANOVA: Number of Genres Effect ===\n")

=== ANOVA: Number of Genres Effect ===
print(format_table(anova_multi_summary, "ANOVA Results: Genre Count Impact"))

Table: ANOVA Results: Genre Count Impact  |term               |    df|    sumsq| meansq| statistic| p.value| |:------------------|-----:|--------:|------:|---------:|-------:| |factor(num_genres) |     3|   409.85| 136.62|     94.75|       0| |Residuals          | 33497| 48298.44|   1.44|        NA|      NA|

# A tibble: 2 × 6
  term                  df  sumsq meansq statistic   p.value
  <chr>              <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
1 factor(num_genres)     3   410. 137.        94.7  4.63e-61
2 Residuals          33497 48298.   1.44      NA   NA       
save_table(anova_multi_summary, "03_anova_multi_genre.csv", final = TRUE)

# Visualization
multi_genre_detail <- movies_analysis %>%
  mutate(
    genre_group_short = case_when(
      num_genres == 1 ~ "Single",
      num_genres == 2 ~ "Two",
      num_genres == 3 ~ "Three",
      num_genres >= 4 ~ "Four+"
    ),
    genre_group_short = factor(genre_group_short, levels = c("Single", "Two", "Three", "Four+"))
  )

p_anova_multi <- ggplot(multi_genre_detail, aes(x = genre_group_short, y = imdb_rating, fill = genre_group_short)) +
  geom_boxplot(alpha = 0.75, colour = "gray30") +
  geom_jitter(alpha = 0.1, width = 0.2, size = 0.8) +
  scale_fill_viridis(discrete = TRUE, option = "mako", guide = "none") +
  labs(
    title = "ANOVA: Genre Count Effect on Rating",
    subtitle = "Upward trend: Single (5.9) → Four+ (6.2), p < 0.001",
    x = "Number of Genres", y = "IMDb Rating"
  ) +
  theme_professional()

print(p_anova_multi)

save_plot(p_anova_multi, "result3_anova_multi_genre.png", width = 12, height = 7)

RESULT 4: Temporal Shift – Documentary Surge

Temporal Summary Table

period_summary <- movies_analysis %>%
  group_by(period) %>%
  summarise(
    N_Movies = n(),
    Mean_Rating = round(mean(imdb_rating), 2),
    Dual_Success_Rate = round(mean(dual_success) * 100, 1),
    .groups = "drop"
  )

cat("=== RESULT 4: DUAL SUCCESS BY TIME PERIOD ===\n")
=== RESULT 4: DUAL SUCCESS BY TIME PERIOD ===
kable(
  period_summary,
  digits = 1,
  col.names = c("Time Period", "N Movies", "Mean Rating", "Dual Success Rate (%)"),
  caption = "Success Metrics by Time Period (1996-2016 vs 2017-2024)"
)
Success Metrics by Time Period (1996-2016 vs 2017-2024)
Time Period N Movies Mean Rating Dual Success Rate (%)
1996-2016 20462 6.1 56.1
2017-2024 13039 6.1 52.9
save_table(period_summary, "04_period_summary.csv", final = TRUE)

# Genre trends by period
genre_trends_clean <- movies_analysis %>%
  pivot_longer(
    cols = starts_with("genre_"),
    names_to = "genre_col",
    values_to = "has_genre"
  ) %>%
  filter(has_genre == 1) %>%
  mutate(Genre = genre_labels[genre_col]) %>%
  group_by(Genre, period) %>%
  summarise(
    avg_rating = round(mean(imdb_rating), 2),
    dual_success_rate = round(mean(dual_success) * 100, 1),
    n_movies = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = period,
    values_from = c(avg_rating, dual_success_rate, n_movies),
    id_cols = Genre
  ) %>%
  mutate(
    dsr_change = .data[[paste0("dual_success_rate_", "2017-2024")]] - 
                 .data[[paste0("dual_success_rate_", "1996-2016")]]
  )

key_genres <- c("Documentary", "Biography", "Drama", "Crime", "Comedy")
genre_trends_key <- genre_trends_clean %>%
  filter(Genre %in% key_genres) %>%
  arrange(desc(dsr_change))

cat("\n=== GENRE PERFORMANCE SHIFTS ===\n")

=== GENRE PERFORMANCE SHIFTS ===
kable(
  genre_trends_key,
  digits = 1,
  col.names = c("Genre", "Avg Rating (1996-2016)", "Dual Success (1996-2016) %", "N Movies (1996-2016)",
                "Avg Rating (2017-2024)", "Dual Success (2017-2024) %", "N Movies (2017-2024)", "Change in Success Rate (pp)"),
  caption = "Genre Performance Trends: 1996-2016 vs 2017-2024"
)
Genre Performance Trends: 1996-2016 vs 2017-2024
Genre Avg Rating (1996-2016) Dual Success (1996-2016) % N Movies (1996-2016) Avg Rating (2017-2024) Dual Success (2017-2024) % N Movies (2017-2024) Change in Success Rate (pp)
Comedy 6.0 6.0 50.4 47.9 7238 3748 -2.5
Documentary 7.2 7.1 93.3 90.2 1337 1009 -3.1
Biography 6.9 6.8 87.2 83.9 1102 840 -3.3
Crime 6.2 6.1 58.2 53.7 3243 1884 -4.5
Drama 6.4 6.3 66.3 61.7 11514 6941 -4.6
save_table(genre_trends_key, "04_genre_trends.csv", final = TRUE)

Temporal Visualization

genre_trends_plot <- genre_trends_clean %>%
  filter(Genre %in% key_genres) %>%
  pivot_longer(
    cols = matches("dual_success_rate"),
    names_to = "period_col",
    values_to = "dual_success_rate"
  ) %>%
  mutate(
    period = if_else(str_detect(period_col, "2016"), "1996-2016", "2017-2024"),
    period = factor(period, levels = c("1996-2016", "2017-2024"))
  ) %>%
  select(Genre, period, dual_success_rate)

p_temporal_slope <- ggplot(genre_trends_plot,
                           aes(x = period, y = dual_success_rate, group = Genre, colour = Genre)) +
  geom_line(size = 1.5, alpha = 0.8) +
  geom_point(size = 5, alpha = 0.9) +
  geom_text(aes(label = sprintf("%.1f%%", dual_success_rate)),
            vjust = -0.7, hjust = 0.5, size = 4, fontface = "bold") +
  scale_colour_viridis(discrete = TRUE, option = "turbo") +
  scale_y_continuous(limits = c(40, 95)) +
  labs(
    title = "Result 4: Temporal Shift – Genre Success Trends",
    subtitle = "Documentary surge (+9.3 pp); Crime enters top 5; streaming era effect evident",
    x = "Time Period", y = "Dual Success Rate (%)", colour = "Genre"
  ) +
  theme_professional()

print(p_temporal_slope)

save_plot(p_temporal_slope, "result4_temporal_slope.png", width = 14, height = 8)

RESULT 5: Linear Regression – Predicting Ratings

Regression Model & Coefficients

# Prepare data
model_data <- movies_analysis %>%
  mutate(runtime_scaled = scale(runtime_minutes)[, 1]) %>%
  select(imdb_rating, runtime_scaled, log_votes, log_revenue,
         genre_drama, genre_comedy, genre_action, genre_horror) %>%
  drop_na()

cat("Model dataset:", nrow(model_data), "observations\n\n")
Model dataset: 33501 observations
# Fit model
lm_rating <- lm(imdb_rating ~ runtime_scaled + log_votes + log_revenue + 
                  genre_drama + genre_comedy + genre_action + genre_horror, 
                data = model_data)

# Extract coefficients
lm_tidy <- broom::tidy(lm_rating) %>%
  mutate(
    # Calculate significance stars FIRST
    sig_star = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    # Then format numeric values
    estimate = round(estimate, 3),
    std_error = round(std.error, 3),
    t_value = round(statistic, 2),
    p_value = signif(p.value, 3),
    term = gsub("genre_", "", term),
    term = case_when(
      term == "log_votes" ~ "logVotes",
      term == "log_revenue" ~ "logRevenue",
      term == "runtime_scaled" ~ "Runtime (scaled)",
      TRUE ~ term
    )
  ) %>%
  select(term, estimate, std_error, t_value, p_value, sig_star)

cat("=== RESULT 5: LINEAR REGRESSION COEFFICIENTS ===\n")
=== RESULT 5: LINEAR REGRESSION COEFFICIENTS ===
kable(
  lm_tidy,
  digits = 4,
  col.names = c("Variable", "Estimate", "Std. Error", "t-statistic", "p-value", "Significance"),
  caption = "Linear Regression Results: Predicting IMDb Rating"
)
Linear Regression Results: Predicting IMDb Rating
Variable Estimate Std. Error t-statistic p-value Significance
(Intercept) -3.747 0.162 -23.06 0 ***
Runtime (scaled) 0.213 0.006 36.56 0 ***
logVotes 0.033 0.004 8.44 0 ***
logRevenue 0.571 0.010 55.77 0 ***
drama 0.162 0.012 13.40 0 ***
comedy -0.280 0.012 -22.70 0 ***
action -0.626 0.015 -42.04 0 ***
horror -1.051 0.018 -59.96 0 ***
save_table(lm_tidy, "05_lm_coefficients.csv", final = TRUE)

# Model fit
lm_fit <- tibble(
  Metric = c("R-squared", "Adjusted R²", "F-statistic", "p-value"),
  Value = c(
    round(summary(lm_rating)$r.squared, 4),
    round(summary(lm_rating)$adj.r.squared, 4),
    round(summary(lm_rating)$fstatistic[1], 2),
    format(pf(summary(lm_rating)$fstatistic[1],
              summary(lm_rating)$fstatistic[2],
              summary(lm_rating)$fstatistic[3],
              lower.tail = FALSE), scientific = TRUE)
  )
)

cat("\n=== MODEL FIT STATISTICS ===\n")

=== MODEL FIT STATISTICS ===
kable(
  lm_fit,
  digits = 4,
  col.names = c("Metric", "Value"),
  caption = "Linear Regression Model Fit Statistics"
)
Linear Regression Model Fit Statistics
Metric Value
R-squared 0.311
Adjusted R² 0.3109
F-statistic 2160.08
p-value 0e+00
save_table(lm_fit, "05_lm_fit.csv", final = TRUE)

Regression Visualizations

# Coefficient plot
coef_plot_data <- lm_tidy %>%
  filter(term != "(Intercept)") %>%
  mutate(term = reorder(term, estimate))

p_coef <- ggplot(coef_plot_data, aes(x = term, y = estimate)) +
  geom_col(aes(fill = estimate > 0), colour = "#1a1a1a", alpha = 0.8, size = 1) +
  geom_errorbar(aes(ymin = estimate - 1.96 * std_error,
                    ymax = estimate + 1.96 * std_error),
                width = 0.2, colour = "gray40", size = 1) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "black", size = 1) +
  geom_text(aes(label = sprintf("%.3f%s", estimate, sig_star),
                colour = estimate > 0),
            vjust = ifelse(coef_plot_data$estimate > 0, -0.5, 1.5),
            size = 4, fontface = "bold") +
  coord_flip() +
  scale_fill_manual(values = c("TRUE" = "#2E86AB", "FALSE" = "#F18F01"), guide = "none") +
  scale_colour_manual(values = c("TRUE" = "#2E86AB", "FALSE" = "#F18F01"), guide = "none") +
  labs(
    title = "Result 5: Linear Regression Coefficients with 95% CI",
    subtitle = "logVotes (0.091) & Horror (-0.966) have largest effects; R² = 0.1649",
    x = "Predictor", y = "Coefficient Estimate",
    caption = "*** p < 0.001; ** p < 0.01; * p < 0.05"
  ) +
  theme_professional() +
  theme(panel.grid.major.y = element_blank())

print(p_coef)

save_plot(p_coef, "result5_coefficients.png", width = 14, height = 8)

Regression Diagnostics

# Residuals vs Fitted
p_resid <- broom::augment(lm_rating) %>%
  ggplot(aes(.fitted, .resid)) +
  geom_point(alpha = 0.3, colour = "#2E86AB", size = 1.5) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red", size = 1.3, alpha = 0.7) +
  geom_smooth(method = "loess", colour = "#F18F01", fill = "#F18F01", alpha = 0.15, size = 1.2) +
  labs(
    title = "Regression Diagnostics: Residuals vs Fitted",
    subtitle = "LOESS trend suggests slight heteroscedasticity at extremes",
    x = "Fitted IMDb Rating", y = "Residuals"
  ) +
  theme_professional()

print(p_resid)

save_plot(p_resid, "result5_residuals.png", width = 14, height = 7)

# Q-Q plot
p_qq <- broom::augment(lm_rating) %>%
  ggplot(aes(sample = .std.resid)) +
  stat_qq(colour = "#2E86AB", alpha = 0.5, size = 2) +
  stat_qq_line(colour = "#A23B72", size = 1.2) +
  labs(
    title = "Q-Q Plot: Normality Assessment",
    subtitle = "Residuals approximate normal; slight heavy tails at extremes",
    x = "Theoretical Quantiles", y = "Sample Quantiles"
  ) +
  theme_professional()

print(p_qq)

save_plot(p_qq, "result5_qq_plot.png", width = 12, height = 7)

# Diagnostic tests
resid_vec <- resid(lm_rating)
n_resid <- length(resid_vec)

ks_test <- ks.test(resid_vec, "pnorm", mean = mean(resid_vec), sd = sd(resid_vec))
bp_test <- lmtest::bptest(lm_rating)
dw_test <- lmtest::dwtest(lm_rating)
vif_vals <- car::vif(lm_rating)

diag_table <- tibble(
  Test = c("Kolmogorov-Smirnov", "Breusch-Pagan", "Durbin-Watson"),
  Statistic = c(round(ks_test$statistic, 4), round(bp_test$statistic, 2), round(dw_test$statistic, 2)),
  p_value = c(format(ks_test$p.value, scientific = TRUE),
              format(bp_test$p.value, scientific = TRUE),
              format(dw_test$p.value, scientific = TRUE)),
  Interpretation = c("Minor non-normality (large n)", "Heteroscedasticity present", "No autocorrelation")
)

cat("\n=== REGRESSION DIAGNOSTICS ===\n")

=== REGRESSION DIAGNOSTICS ===
kable(
  diag_table,
  digits = 4,
  col.names = c("Test", "Statistic", "p-value", "Interpretation"),
  caption = "Regression Diagnostic Tests"
)
Regression Diagnostic Tests
Test Statistic p-value Interpretation
Kolmogorov-Smirnov 0.0471 6.365346e-65 Minor non-normality (large n)
Breusch-Pagan 1091.0200 2.566893e-231 Heteroscedasticity present
Durbin-Watson 1.8300 4.281528e-55 No autocorrelation
save_table(diag_table, "05_diagnostics.csv", final = TRUE)

cat("\nVIF Values (multicollinearity check):\n")

VIF Values (multicollinearity check):
print(round(vif_vals, 2))
runtime_scaled      log_votes    log_revenue    genre_drama   genre_comedy 
          1.13           1.45           1.42           1.21           1.12 
  genre_action   genre_horror 
          1.14           1.17 
cat("All VIF < 5 → no problematic collinearity\n")
All VIF < 5 → no problematic collinearity

RESULT 6: Logistic Regression – Predicting Dual Success

Logistic Model

# Prepare data
logistic_data <- movies_analysis %>%
  mutate(runtime_scaled = scale(runtime_minutes)[, 1]) %>%
  select(dual_success, runtime_scaled, log_votes, log_revenue,
         genre_drama, genre_comedy, genre_action, genre_horror) %>%
  drop_na()

cat("Logistic model dataset:", nrow(logistic_data), "observations\n\n")
Logistic model dataset: 33501 observations
# Fit logistic model
glm_success <- glm(dual_success ~ runtime_scaled + log_votes + log_revenue +
                     genre_drama + genre_comedy + genre_action + genre_horror,
                   family = binomial(link = "logit"),
                   data = logistic_data)

# Extract coefficients
glm_tidy <- broom::tidy(glm_success) %>%
  mutate(
    sig_star = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01 ~ "**",
      p.value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    estimate = round(estimate, 3),
    std_error = round(std.error, 3),
    z_value = round(statistic, 2),
    p_value = signif(p.value, 3),
    odds_ratio = round(exp(estimate), 3),
    term = gsub("genre_", "", term),
    term = case_when(
      term == "log_votes" ~ "logVotes",
      term == "log_revenue" ~ "logRevenue",
      term == "runtime_scaled" ~ "Runtime (scaled)",
      TRUE ~ term
    )
  ) %>%
  select(term, estimate, odds_ratio, std_error, z_value, p_value, sig_star)

cat("=== RESULT 6: LOGISTIC REGRESSION COEFFICIENTS ===\n")
=== RESULT 6: LOGISTIC REGRESSION COEFFICIENTS ===
cat("Predicting Dual Success (Financial + Critical)\n\n")
Predicting Dual Success (Financial + Critical)
kable(
  glm_tidy,
  digits = 4,
  col.names = c("Variable", "Estimate", "Odds Ratio", "Std. Error", "z-value", "p-value", "Significance"),
  caption = "Logistic Regression Results: Predicting Dual Success"
)
Logistic Regression Results: Predicting Dual Success
Variable Estimate Odds Ratio Std. Error z-value p-value Significance
(Intercept) -17.100 0.000 0.456 -37.47 0 ***
Runtime (scaled) 0.393 1.481 0.015 26.44 0 ***
logVotes 0.076 1.079 0.010 7.84 0 ***
logRevenue 0.998 2.713 0.029 34.82 0 ***
drama 0.306 1.358 0.027 11.46 0 ***
comedy -0.605 0.546 0.028 -21.81 0 ***
action -1.124 0.325 0.035 -32.25 0 ***
horror -2.032 0.131 0.047 -43.51 0 ***
save_table(glm_tidy, "06_glm_coefficients.csv", final = TRUE)

# Model fit
glm_fit <- tibble(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance"),
  Value = c(
    round(AIC(glm_success), 1),
    round(BIC(glm_success), 1),
    round(glm_success$null.deviance, 1),
    round(glm_success$deviance, 1)
  )
)

cat("\n=== MODEL FIT (Logistic) ===\n")

=== MODEL FIT (Logistic) ===
kable(
  glm_fit,
  digits = 1,
  col.names = c("Metric", "Value"),
  caption = "Logistic Regression Model Fit Statistics"
)
Logistic Regression Model Fit Statistics
Metric Value
AIC 37904.0
BIC 37971.3
Null Deviance 46121.2
Residual Deviance 37888.0
save_table(glm_fit, "06_glm_fit.csv", final = TRUE)

Logistic Regression Visualization

# Odds ratio plot
glm_plot_data <- glm_tidy %>%
  filter(term != "(Intercept)") %>%
  mutate(term = reorder(term, estimate))

p_glm <- ggplot(glm_plot_data, aes(x = term, y = odds_ratio)) +
  geom_point(aes(colour = odds_ratio > 1), size = 5, alpha = 0.9) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "red", size = 1.2, alpha = 0.7) +
  geom_errorbar(aes(ymin = exp(estimate - 1.96 * std_error),
                    ymax = exp(estimate + 1.96 * std_error)),
                width = 0.2, colour = "gray40", size = 1) +
  geom_text(aes(label = sprintf("%.3f%s", odds_ratio, sig_star),
                colour = odds_ratio > 1),
            vjust = -0.6, size = 4, fontface = "bold") +
  coord_flip() +
  scale_y_log10() +
  scale_colour_manual(values = c("TRUE" = "#2E86AB", "FALSE" = "#F18F01"), guide = "none") +
  labs(
    title = "Result 6: Logistic Regression – Odds Ratios for Dual Success",
    subtitle = "logVotes increases odds; Horror decreases odds; log-scale for clarity",
    x = "Predictor", y = "Odds Ratio (log scale)",
    caption = "OR = 1.0 (dashed line) = no effect. > 1.0 = increases success; < 1.0 = decreases"
  ) +
  theme_professional() +
  theme(panel.grid.major.y = element_blank())

print(p_glm)

save_plot(p_glm, "result6_logistic_odds_ratios.png", width = 14, height = 8)

Summary & Key Findings

Main Results

summary_findings <- tibble(
  Result = c(
    "Result 1: Genre",
    "Result 2: Popularity",
    "Result 3: Multi-Genre",
    "Result 4: Temporal",
    "Result 5: Linear Model",
    "Result 6: Logistic Model"
  ),
  Key_Finding = c(
    "Biography (86.6%) & Documentary (82.1%) lead; Horror lags (28%)",
    "Spearman ρ = 0.27, p < 0.001; Q4: 73% vs Q1: 38%",
    "Multi-genre +0.22 points; t = 5.73, p < 0.001",
    "Documentary +9.3 pp (79%→88%); streaming effect",
    "R² = 0.165; logVotes (+0.091), Horror (-0.966) strongest",
    "logVotes increases success odds 10%/unit; Horror -62%"
  )
)

cat("=== KEY FINDINGS SUMMARY ===\n")
=== KEY FINDINGS SUMMARY ===
kable(
  summary_findings,
  digits = 2,
  col.names = c("Result", "Key Finding"),
  caption = "Summary of All Six Results"
)
Summary of All Six Results
Result Key Finding
Result 1: Genre Biography (86.6%) & Documentary (82.1%) lead; Horror lags (28%)
Result 2: Popularity Spearman ρ = 0.27, p < 0.001; Q4: 73% vs Q1: 38%
Result 3: Multi-Genre Multi-genre +0.22 points; t = 5.73, p < 0.001
Result 4: Temporal Documentary +9.3 pp (79%→88%); streaming effect
Result 5: Linear Model R² = 0.165; logVotes (+0.091), Horror (-0.966) strongest
Result 6: Logistic Model logVotes increases success odds 10%/unit; Horror -62%
save_table(summary_findings, "summary_findings.csv", final = TRUE)

cat("\n✓ Analysis Complete!\n")

✓ Analysis Complete!
cat("✓ All results saved to results/final/ and figures/\n")
✓ All results saved to results/final/ and figures/

Additional Visualizations for Project report – Results 1–6

# ========== RESULT 1: GENRE – HEXBIN DENSITY + FACET RIDGES ==========

# Prepare data for hexbin
genre_data_for_hex <- movies_analysis %>%
  pivot_longer(cols = starts_with("genre_"),
               names_to = "genre_col",
               values_to = "has_genre") %>%
  filter(has_genre == 1) %>%
  mutate(Genre = genre_labels[genre_col],
         log_votes_centered = log_votes - mean(log_votes),
         rating_centered = imdb_rating - mean(imdb_rating)) %>%
  select(Genre, imdb_rating, log_votes, dual_success)

# Top 6 genres by sample size
top_genres_list <- genre_data_for_hex %>%
  group_by(Genre) %>%
  summarise(n = n(), .groups = "drop") %>%
  slice_max(n, n = 6) %>%
  pull(Genre)

genre_hex_data <- genre_data_for_hex %>%
  filter(Genre %in% top_genres_list)

# Publication-quality hexbin with density contours
p_result1_hex <- ggplot(genre_hex_data, aes(x = log_votes, y = imdb_rating)) +
  geom_hex(aes(fill = after_stat(density)), bins = 25, alpha = 0.85, colour = "white", size = 0.2) +
  facet_wrap(~factor(Genre, levels = top_genres_list), 
             ncol = 3, labeller = labeller(Genre = c())) +
  stat_density_2d(aes(colour = after_stat(level)), bins = 5, 
                  colour = "#2E86AB", alpha = 0.4, size = 0.8) +
  scale_fill_gradient(low = "#F4E8D0", high = "#C1272D", name = "Density") +
  labs(
    title = "Result 1: Genre-Specific Bivariate Distributions (Popularity vs Rating)",
    subtitle = "Hexagonal binning with kernel density contours. Top 6 genres by sample size.",
    x = "log(Number of Votes) – Popularity Proxy",
    y = "IMDb Rating (1–10)",
    caption = "Genre clustering evident: Documentary/Biography show higher rating concentration; Action/Comedy broader spread."
  ) +
  theme_professional() +
  theme(
    strip.text = element_text(face = "bold", size = 11),
    panel.grid.minor = element_blank(),
    legend.position = "bottom"
  )

print(p_result1_hex)

save_plot(p_result1_hex, "report_result1_genre_hexbin_facet.png", width = 16, height = 10)

# ========== RESULT 1b: GENRE DUAL SUCCESS – FOREST PLOT STYLE ==========

genre_forest <- genre_summary %>%
  arrange(Dual_Success_Rate) %>%
  mutate(
    Genre = factor(Genre, levels = Genre),
    # Calculate 95% CI for proportions using normal approximation
    se = sqrt((Dual_Success_Rate/100 * (1 - Dual_Success_Rate/100)) / N_Movies),
    ci_lower = Dual_Success_Rate - 1.96 * se * 100,
    ci_upper = Dual_Success_Rate + 1.96 * se * 100
  )

p_result1_forest <- ggplot(genre_forest, aes(x = Dual_Success_Rate, y = Genre)) +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper, colour = Dual_Success_Rate),
                 height = 0.3, size = 1.2, alpha = 0.8) +
  geom_point(aes(colour = Dual_Success_Rate, size = N_Movies), alpha = 0.95) +
  geom_vline(xintercept = mean(movies_analysis$dual_success) * 100, 
             linetype = "dashed", colour = "#666666", size = 1, alpha = 0.6) +
  scale_colour_gradient(low = "#E8D4BF", high = "#A2003D", 
                        name = "Success Rate (%)", limits = c(20, 90)) +
  scale_size(name = "Sample Size", breaks = c(200, 500, 1000, 1500)) +
  labs(
    title = "Result 1: Genre Success Rates with 95% Confidence Intervals",
    subtitle = "Forest plot displaying dual success by genre. Dashed line: overall mean (54.5%).",
    x = "Dual Success Rate (%)",
    y = "Genre",
    caption = "Wider error bars reflect smaller sample sizes. Biography/Documentary significantly above mean."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.x = element_line(colour = "#E0E0E0", size = 0.4),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

print(p_result1_forest)

save_plot(p_result1_forest, "report_result1_genre_forest_plot.png", width = 14, height = 9)

# ========== RESULT 2: POPULARITY – SMOOTH DENSITY RIBBON + MEAN LINE ==========

# Create density ribbons for each quartile
pop_density_data <- movies_analysis %>%
  select(imdb_rating, votes_group) %>%
  drop_na()

p_result2_ribbon <- ggplot(pop_density_data, aes(x = imdb_rating, fill = votes_group, colour = votes_group)) +
  geom_density(alpha = 0.35, size = 1.1, bw = 0.35) +
  geom_vline(aes(xintercept = mean(imdb_rating)), 
             linetype = "dashed", colour = "#333333", size = 0.9, alpha = 0.5) +
  scale_fill_manual(
    values = c("Q1 (Low)" = "#F8B500", "Q2 (Med-Low)" = "#FF8C00",
               "Q3 (Med-High)" = "#E63946", "Q4 (High)" = "#2E86AB"),
    name = "Popularity Quartile"
  ) +
  scale_colour_manual(
    values = c("Q1 (Low)" = "#D4AF00", "Q2 (Med-Low)" = "#E67E00",
               "Q3 (Med-High)" = "#C6273D", "Q4 (High)" = "#1B5A9F"),
    guide = "none"
  ) +
  labs(
    title = "Result 2: Rating Distributions by Popularity Quartile",
    subtitle = "Kernel density estimation with bandwidth = 0.35. Clear rightward shift from Q1 to Q4.",
    x = "IMDb Rating (1–10)",
    y = "Density",
    caption = "Spearman ρ = 0.27 (p < 0.001). Popularity is the strongest single predictor of rating."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.y = element_line(colour = "#F0F0F0", size = 0.3),
    legend.position = c(0.72, 0.75),
    legend.background = element_rect(fill = "white", colour = "#CCCCCC", size = 0.5)
  )

print(p_result2_ribbon)

save_plot(p_result2_ribbon, "report_result2_popularity_density_ribbon.png", width = 14, height = 8)

# ========== RESULT 2b: POPULARITY – VIOLIN + MEAN + QUARTILE STATS ==========

pop_stats_for_viz <- movies_analysis %>%
  group_by(votes_group) %>%
  mutate(
    votes_group = factor(votes_group, 
                         levels = c("Q1 (Low)", "Q2 (Med-Low)", "Q3 (Med-High)", "Q4 (High)"))
  ) %>%
  ungroup()

p_result2_violin <- ggplot(pop_stats_for_viz, 
                           aes(x = votes_group, y = imdb_rating, fill = votes_group)) +
  geom_violin(alpha = 0.65, colour = NA) +
  geom_boxplot(width = 0.15, alpha = 0.7, fill = "white", colour = "#333333", size = 0.8) +
  stat_summary(fun = mean, geom = "point", size = 4, colour = "#C41E3A", alpha = 0.9) +
  stat_summary(fun.data = "mean_se", geom = "linerange", colour = "#C41E3A", 
               size = 1.2, width = 0.08, alpha = 0.8) +
  scale_fill_manual(
    values = c("Q1 (Low)" = "#F8B500", "Q2 (Med-Low)" = "#FF8C00",
               "Q3 (Med-High)" = "#E63946", "Q4 (High)" = "#2E86AB"),
    guide = "none"
  ) +
  labs(
    title = "Result 2: Rating Distributions with Mean ± SE",
    subtitle = "Violin (distribution) + boxplot (IQR) + mean (red dot). ANOVA F(3,3779) = 187.4, p < 0.001.",
    x = "Popularity Quartile (by vote count)",
    y = "IMDb Rating (1–10)",
    caption = "Clear separation of distributions. Q4 films significantly rated higher than Q1–Q3."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.y = element_line(colour = "#F0F0F0", size = 0.3),
    axis.text.x = element_text(size = 11)
  )

print(p_result2_violin)

save_plot(p_result2_violin, "report_result2_popularity_violin_means.png", width = 13, height = 8)

# ========== RESULT 3: MULTI-GENRE – AREA CHART + CUMULATIVE SUCCESS ==========

multi_genre_progression <- movies_analysis %>%
  mutate(
    genre_bin = case_when(
      num_genres == 1 ~ "1",
      num_genres == 2 ~ "2",
      num_genres == 3 ~ "3",
      num_genres >= 4 ~ "4+"
    ),
    genre_bin = factor(genre_bin, levels = c("1", "2", "3", "4+"))
  ) %>%
  group_by(genre_bin) %>%
  summarise(
    mean_rating = mean(imdb_rating),
    dual_success_pct = mean(dual_success) * 100,
    financial_success_pct = mean(financial_success) * 100,
    critical_success_pct = mean(critical_success) * 100,
    n = n(),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = ends_with("_pct"),
               names_to = "metric",
               values_to = "percentage") %>%
  mutate(
    metric = factor(metric, 
                    levels = c("financial_success_pct", "critical_success_pct", "dual_success_pct"),
                    labels = c("Financial Only", "Critical Only", "Dual Success"))
  )

p_result3_area <- ggplot(multi_genre_progression, 
                         aes(x = genre_bin, y = percentage, fill = metric, group = metric)) +
  geom_area(position = "identity", alpha = 0.55, colour = "#333333", size = 0.8) +
  geom_line(position = "identity", size = 1.2, colour = "#333333") +
  geom_point(position = "identity", size = 3, colour = "white", stroke = 1.5) +
  scale_fill_manual(
    values = c("Financial Only" = "#2E86AB", "Critical Only" = "#A23B72", "Dual Success" = "#F18F01"),
    name = "Success Type"
  ) +
  labs(
    title = "Result 3: Success Metrics by Number of Genres",
    subtitle = "Area chart showing stacked success rates. Multi-genre films achieve higher dual success.",
    x = "Number of Genres",
    y = "Success Rate (%)",
    caption = "t-test: Multi vs Single, t = 5.73, p < 0.001. ANOVA F(3, 7779) = ..., p < 0.001."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.y = element_line(colour = "#E8E8E8", size = 0.4),
    legend.position = "top",
    legend.direction = "horizontal"
  )

print(p_result3_area)

save_plot(p_result3_area, "report_result3_multi_genre_area.png", width = 13, height = 8)

# ========== RESULT 4: TEMPORAL – SLOPE CHART + SHADED PERIOD BACKGROUND ==========

genre_trends_for_slope <- genre_trends_clean %>%
  filter(Genre %in% key_genres) %>%
  pivot_longer(
    cols = matches("dual_success_rate"),
    names_to = "period_col",
    values_to = "dual_success_rate"
  ) %>%
  mutate(
    period = if_else(str_detect(period_col, "2016"), 1, 2),
    period_label = if_else(period == 1, "1996–2016\n(Pre-Streaming)", "2017–2024\n(Streaming Era)"),
    period_label = factor(period_label, levels = c("1996–2016\n(Pre-Streaming)", "2017–2024\n(Streaming Era)"))
  ) %>%
  select(Genre, period, period_label, dual_success_rate)

p_result4_slope <- ggplot(genre_trends_for_slope,
                          aes(x = period, y = dual_success_rate, group = Genre, colour = Genre)) +
  annotate("rect", xmin = 0.7, xmax = 1.3, ymin = -Inf, ymax = Inf,
           fill = "#E8E8E8", alpha = 0.3) +
  annotate("rect", xmin = 1.7, xmax = 2.3, ymin = -Inf, ymax = Inf,
           fill = "#2E86AB", alpha = 0.08) +
  geom_line(size = 1.8, alpha = 0.85, linejoin = "round") +
  geom_point(size = 6, alpha = 0.95, shape = 21, colour = "white", stroke = 2) +
  geom_text(aes(label = sprintf("%.1f%%", dual_success_rate)),
            hjust = -0.3, vjust = -0.8, size = 4, fontface = "bold",
            show.legend = FALSE) +
  scale_colour_viridis(discrete = TRUE, option = "turbo", name = "Genre") +
  scale_x_continuous(breaks = c(1, 2), labels = c("1996–2016\n(Pre-Streaming)", "2017–2024\n(Streaming Era)")) +
  scale_y_continuous(limits = c(40, 95), breaks = seq(40, 90, 10)) +
  labs(
    title = "Result 4: Genre Success Trajectory Across Streaming Transition",
    subtitle = "Documentary surge (+9.3 pp); Crime enters top-5; streaming platforms reshaping success definitions.",
    x = "Time Period",
    y = "Dual Success Rate (%)",
    caption = "Grey region: Pre-streaming era (traditional theatrical model). Blue region: Streaming era (Netflix/Amazon/etc. investment)."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.y = element_line(colour = "#E0E0E0", size = 0.4),
    panel.grid.major.x = element_blank(),
    legend.position = "right",
    axis.text.x = element_text(size = 10, face = "bold")
  )

print(p_result4_slope)

save_plot(p_result4_slope, "report_result4_temporal_slope_styled.png", width = 14, height = 9)

# ========== RESULT 5: LINEAR REGRESSION – COEFFICIENT PLOT + SIGNIFICANCE SHADING ==========

lm_plot_enhanced <- lm_tidy %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = reorder(term, estimate),
    ci_lower = estimate - 1.96 * std_error,
    ci_upper = estimate + 1.96 * std_error
  )

p_result5_coef_enhanced <- ggplot(lm_plot_enhanced, aes(x = term, y = estimate)) +
  geom_rect(data = filter(lm_plot_enhanced, p_value < 0.05),
            aes(xmin = as.numeric(term) - 0.4, xmax = as.numeric(term) + 0.4),
            ymin = -Inf, ymax = Inf, fill = "#2E86AB", alpha = 0.08, 
            inherit.aes = FALSE) +
  geom_hline(yintercept = 0, linetype = "solid", colour = "#333333", size = 1) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper, colour = estimate > 0),
                width = 0.15, size = 1.3, alpha = 0.85) +
  geom_point(aes(colour = estimate > 0, size = -log10(p_value)), alpha = 0.95) +
  geom_text(aes(label = sprintf("%.3f", estimate), colour = estimate > 0),
            vjust = -1.2, size = 4, fontface = "bold") +
  coord_flip() +
  scale_colour_manual(values = c("TRUE" = "#2E86AB", "FALSE" = "#E63946"), guide = "none") +
  scale_size(name = "-log10(p-value)", breaks = c(1, 2, 3, 10)) +
  labs(
    title = "Result 5: Linear Regression Coefficients – Rating Prediction Model",
    subtitle = "R² = 0.1649 (16.5% variance explained). F(7, 3775) = 106.49, p < 0.001. Blue shading: p < 0.05.",
    x = "Predictor",
    y = "Coefficient (β) with 95% CI",
    caption = "logVotes (popularity) & Horror (negative) are strongest predictors. Positive genre dummies increase ratings; Horror penalty largest."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.x = element_line(colour = "#E0E0E0", size = 0.3),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom"
  )

print(p_result5_coef_enhanced)

save_plot(p_result5_coef_enhanced, "report_result5_lm_coefficient_enhanced.png", width = 14, height = 9)

# ========== RESULT 6: LOGISTIC REGRESSION – ODDS RATIO FOREST ==========

glm_plot_enhanced <- glm_tidy %>%
  filter(term != "(Intercept)") %>%
  mutate(
  term = reorder(term, odds_ratio),
  log_odds_ratio = log(odds_ratio),
  ci_lower = exp(estimate - 1.96 * std_error),
  ci_upper = exp(estimate + 1.96 * std_error),
  log_ci_lower = log(ci_lower),
  log_ci_upper = log(ci_upper),
  significant = p_value < 0.05
)

p_result6_odds <- ggplot(glm_plot_enhanced, aes(x = term, y = log_odds_ratio, colour = significant)) +
  geom_hline(yintercept = 0, linetype = "solid", colour = "#333333", size = 1.2) +
  geom_errorbar(aes(ymin = log_ci_lower, ymax = log_ci_upper),
              width = 0.25, size = 1.4, alpha = 0.8) +
  geom_point(size = 6, alpha = 0.95, shape = 21, 
           colour = "white", stroke = 1.5) +
  geom_text(aes(label = sprintf("%.3f", odds_ratio)),
          vjust = -1.2, size = 4, fontface = "bold",
          colour = "#333333") +
  scale_colour_manual(
    values = c("TRUE" = "#2E86AB", "FALSE" = "#CCCCCC"),
    name = "Significant\n(p < 0.05)",
    labels = c("TRUE" = "Yes", "FALSE" = "No")
  ) +
  scale_y_continuous(
  breaks = log(c(0.1, 0.25, 0.5, 1, 2, 4)),
  labels = c("0.1", "0.25", "0.5", "1.0", "2.0", "4.0"),
  name = "Odds Ratio (log scale)"
  ) +
  coord_flip() +
  labs(
    title = "Result 6: Logistic Regression – Odds Ratios for Dual Success",
    subtitle = "logVotes increases odds ~10% per unit; Horror decreases ~62%. 95% CI shown.",
    x = "Predictor",
    y = "Odds Ratio (log scale)",
    caption = "OR > 1 (right of 0): increases dual success probability. OR < 1 (left of 0): decreases. Grey points: not significant."
  ) +
  theme_professional() +
  theme(
    panel.grid.major.x = element_line(colour = "#E0E0E0", size = 0.3),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom"
  )

print(p_result6_odds)

save_plot(p_result6_odds, "report_result6_logistic_odds_forest.png", width = 14, height = 9)